Disclaimer: Claude AI was used in this assignment
- Removed categorical columns
- Perfomed Linear Regression
- Reports:
- Data drift
- Regression quality metrics
In [ ]:
import warnings
warnings.filterwarnings('ignore')
In [49]:
# Install evidently if needed
# pip install evidently
import evidently
print(f"Evidently version: {evidently.__version__}")
Evidently version: 0.6.4
In [50]:
import pandas as pd
import numpy as np
import requests
import io
from datetime import datetime, time
from sklearn import datasets, ensemble
from evidently import ColumnMapping
from evidently.report import Report
from evidently.metrics import DatasetDriftMetric, RegressionQualityMetric
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
In [51]:
file_path = "/Users/brunamedeiros/Documents/University of Chicago/Summer 2025 - ML Ops/HW4/cancer_reg.csv"
df = pd.read_csv(file_path, encoding='latin-1')
print(f"Dataset shape: {df.shape}")
print(f"Missing values:\n{df.isnull().sum()}")
Dataset shape: (3047, 34) Missing values: avgAnnCount 0 avgDeathsPerYear 0 TARGET_deathRate 0 incidenceRate 0 medIncome 0 popEst2015 0 povertyPercent 0 studyPerCap 0 binnedInc 0 MedianAge 0 MedianAgeMale 0 MedianAgeFemale 0 Geography 0 AvgHouseholdSize 0 PercentMarried 0 PctNoHS18_24 0 PctHS18_24 0 PctSomeCol18_24 2285 PctBachDeg18_24 0 PctHS25_Over 0 PctBachDeg25_Over 0 PctEmployed16_Over 152 PctUnemployed16_Over 0 PctPrivateCoverage 0 PctPrivateCoverageAlone 609 PctEmpPrivCoverage 0 PctPublicCoverage 0 PctPublicCoverageAlone 0 PctWhite 0 PctBlack 0 PctAsian 0 PctOtherRace 0 PctMarriedHouseholds 0 BirthRate 0 dtype: int64
In [58]:
df.dropna(inplace=True)
df = df.drop(['Geography', 'binnedInc'], axis=1) # Drop categorical columns
# # Encode categorical variables
# categorical_cols = ['binnedInc', 'Geography']
# label_encoders = {}
# for col in categorical_cols:
# le = LabelEncoder()
# df[col] = le.fit_transform(df[col])
# label_encoders[col] = le # store encoder if needed later
# create datasets
df_A = df.copy()
df_A['medIncome'] -= 40000
df_AB = df_A.copy()
df_AB['povertyPercent'] += 20
df_ABC = df_AB.copy()
df_ABC['AvgHouseholdSize'] += 2
datasets = {
"df": df,
"df_A": df_A,
"df_AB": df_AB,
"df_ABC": df_ABC
}
print("You have 4 datasets:")
for name in datasets:
print(f"- {name}")
# Split features and target
X_dict = {}
y_dict = {}
print(f"\nSplitting features and target...")
for name, data in datasets.items():
X_dict[name] = data.drop(columns=['TARGET_deathRate'])
y_dict[name] = data['TARGET_deathRate']
# Train-test split
print(f"\nTrain-test split...")
X_train_dict = {}
y_train_dict = {}
X_test_dict = {}
y_test_dict = {}
for name in datasets.keys():
X_train, X_test, y_train, y_test = train_test_split(X_dict[name], y_dict[name], test_size=0.2, random_state=42)
X_train_dict[name] = X_train
y_train_dict[name] = y_train
X_test_dict[name] = X_test
y_test_dict[name] = y_test
print(" Results:")
for name in datasets.keys():
print(f" {name} - Train shape: {X_train_dict[name].shape}, Test shape: {X_test_dict[name].shape}")
# Train models
print(f"\nTraining model on original dataset...")
model = LinearRegression()
model.fit(X_train_dict["df"], y_train_dict["df"]) # only train on original dataset
# EVIDENTLY REPORTS
# 1. Add predictions to your each test dataset
print(f"\nAdding predictions to test datasets (to each alternated DF)...")
complete_test_dict = {}
for name in datasets.keys():
complete_test_dict[name] = X_test_dict[name].copy()
complete_test_dict[name]['TARGET_deathRate'] = y_test_dict[name] # Add target
complete_test_dict[name]['prediction'] = model.predict(X_test_dict[name]) # Add predictions
# 2. Create column mapping
column_mapping = ColumnMapping()
column_mapping.target = 'TARGET_deathRate'
column_mapping.prediction = 'prediction'
# Evidently reports
report_A = Report(metrics=[
DatasetDriftMetric(), # Shows which features drifted
RegressionQualityMetric() # Shows how prediction quality changed
])
report_A.run(reference_data=complete_test_dict["df"], current_data=complete_test_dict["df_A"], column_mapping=column_mapping)
report_A.save_html("data_drift_A.html")
report_AB = Report(metrics=[DatasetDriftMetric(),RegressionQualityMetric()])
report_AB.run(reference_data=complete_test_dict["df"], current_data=complete_test_dict["df_AB"], column_mapping=column_mapping)
report_AB.save_html("data_drift_AB.html")
report_ABC = Report(metrics=[DatasetDriftMetric(),RegressionQualityMetric()])
report_ABC.run(reference_data=complete_test_dict["df"], current_data=complete_test_dict["df_ABC"], column_mapping=column_mapping)
report_ABC.save_html("data_drift_ABC.html")
You have 4 datasets:
- df
- df_A
- df_AB
- df_ABC
Splitting features and target...
Train-test split...
Results:
df - Train shape: (472, 31), Test shape: (119, 31)
df_A - Train shape: (472, 31), Test shape: (119, 31)
df_AB - Train shape: (472, 31), Test shape: (119, 31)
df_ABC - Train shape: (472, 31), Test shape: (119, 31)
Training model on original dataset...
Adding predictions to test datasets (to each alternated DF)...
In [59]:
report_A
Out[59]:
In [60]:
report_AB
Out[60]:
In [61]:
report_ABC
Out[61]:
Conclusion: Even though the data drift wasn't significant (smaller than the 0.5 threshold), it significantly impacted model performance. We can see MAE and MAPE worsened with every additional change on the dataset.